{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from sklearn.model_selection import train_test_split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "content = []\n",
    "with open('data/simp/train.tsv') as f:\n",
    "    content.append(f.readlines())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "18770\n"
     ]
    }
   ],
   "source": [
    "print(len(content[0]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "17770\n",
      "T\t党国元老张群王世杰及罗家伦夫人\n",
      "\n",
      "1000\n",
      "T\t却集合了很多与众不同的地方\n",
      "\n"
     ]
    }
   ],
   "source": [
    "test = content[0][-1000:]\n",
    "train = content[0][:-1000]\n",
    "print(len(train))\n",
    "print(train[0])\n",
    "print(len(test))\n",
    "print(test[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('data/simp/test.tsv', 'w') as the_file:\n",
    "    for i in test:\n",
    "        the_file.write(i)\n",
    "        \n",
    "with open('data/simp/train1.tsv', 'w') as the_file:\n",
    "    for i in train:\n",
    "        the_file.write(i)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "     0                                        1\n",
      "0    T                            却集合了很多与众不同的地方\n",
      "1    M                         给亏困停工半停工企业以扶持和帮助\n",
      "2    T                           随时可能发生在自己的生活周遭\n",
      "3    T                           固然身受经济不景气影响而受挫\n",
      "4    M                          只有去凉城的３２位旅客在等待着\n",
      "5    M  聚集着河南安阳市辖区内所有乡镇的党委书记优秀党员代表和自动赶来的群众２００多人\n",
      "6    T                              甚至就主政者的观点来说\n",
      "7    M                   很快得到北约其他成员国的支持和华约国家的响应\n",
      "8    M                               昨晚在车上休息得不错\n",
      "9    T                          以及高雄土地资产即将处理的题材\n",
      "10   M                              当地产的罗非鱼鲟鱼鳜鱼\n",
      "11   T                     有补习班开出一学期只收两万五千元的低收费\n",
      "12   M                               佐藤还因洛案被判过刑\n",
      "13   M                               经过激战和大量的工作\n",
      "14   T                  在此之前于前任县长陈进兴任内的县治迁建弊端调查\n",
      "15   M                        该行政长官认为他继续留下来是有害的\n",
      "16   T                              学生受此僵化教育的影响\n",
      "17   M                     当历史的脚步匆匆跨入本世纪最后十年的时候\n",
      "18   T                              各校的刊物皆可放置在此\n",
      "19   T                     所带来的压力便会更促进基本技能训练之成长\n",
      "20   M                            一个小铁人的绰号便不胫而走\n",
      "21   T                            对社会有热心参与之心的型中\n",
      "22   M                             ２７０多个日日夜夜过去了\n",
      "23   M                     南非有识之士都认为种族主义制度必须结束了\n",
      "24   M                         水下基础部分已完成百分之七十以上\n",
      "25   M                            电视塔工地依旧干得热火朝天\n",
      "26   T                      今年的迎新活动四日在久久酒一次茶馆举行\n",
      "27   T                          昨天第一名古金水跳完五公尺一十\n",
      "28   M                华远公司准备在夜市期间举办小麻雀办公电脑百人公开赛\n",
      "29   T                                师生都已经拟亲人了\n",
      "..  ..                                      ...\n",
      "970  M                            由北向南铺开的两排长条凳上\n",
      "971  T                             每日盘中震荡幅度愈来愈小\n",
      "972  T                             以及对女儿独立性格的鼓励\n",
      "973  T                               更往往是这份实践精神\n",
      "974  M                浙江省各级农业银行信用社扎扎实实为农业升温添薪加油\n",
      "975  T      男单由北京亚运第二单打印尼的阿迪及世界排名第一的马来西亚选手傅国强挂帅\n",
      "976  M                      社会主义经济发展与精神文明建设是同步的\n",
      "977  M                             思想上的疙瘩就容易解开了\n",
      "978  T                          郭立诚收信当天立即约林汉章见面\n",
      "979  M                         然后在零号块的两旁用模子逐段浇筑\n",
      "980  M                                我们默默地站在墓前\n",
      "981  T                            竟是元朝至正六年刻的金刚经\n",
      "982  T                         不但庭硕轻松愉快的学会了注音符号\n",
      "983  T                              希望能藉此解放教育制度\n",
      "984  M                                道出了老板们的忧虑\n",
      "985  T                        至于过程是否就读教育部不认可的学校\n",
      "986  T                          全民健保法从政策规划到立法过程\n",
      "987  M                          中年男子的神情中流露出一阵惊慌\n",
      "988  T                              校方站在爱护学生的立场\n",
      "989  M                    李鹏总理在世界经济论坛年会上向世界庄严宣布\n",
      "990  M                              那已是一九八五年的时候\n",
      "991  M                      所以我们想兰州三毛产品在东北要占领市场\n",
      "992  M                               也要动员乡上找点房子\n",
      "993  M                             一直驶到下榻的迎宾馆桂园\n",
      "994  M                              一位产妇和新生儿得救了\n",
      "995  M                             认购数一般在两三千元左右\n",
      "996  M                        站在坝上的领导同志也禁不住泪如泉涌\n",
      "997  M                           积压最多的还是些外省市的报纸\n",
      "998  M                               把科技转化请到帅位上\n",
      "999  T                     是环泰企业桃园厂生产的雪莉土豆仁牛奶冰棒\n",
      "\n",
      "[1000 rows x 2 columns]\n"
     ]
    }
   ],
   "source": [
    "data = pd.read_csv('data/simp/test.tsv', header=None, delimiter='\\t')\n",
    "print(data)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
